In [30]:
from os.path import join, dirname
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json

from sys import path
from os import getcwd

# Sort out paths so that this can use functions from the main codebase
path.insert(0, dirname(getcwd()))

import environment
from scan3 import settings

In [35]:
# Load the staged CSV from the configured input root into `df`, the working
# frame that the cells below read and modify.
data_fname = join(settings.DATA_IN_ROOT, "data_staging", "all_by_baby_enriched_v3.csv")
df = pd.read_csv(data_fname)
# Print the row count as a quick sanity check on the load.
print("Loaded {} rows of data".format(len(df)))


Loaded 63788 rows of data

Overview

Define some functions that tidy up and normalise the choices for each categorical field.

Generate some lookup tables that can be incorporated in the main pipeline.

From the initial discussion with Basky the ethnicity fields need some additional attention:

  • Ethnicity, there are two fields for this so we need to join them
    • Four categories: Caucasian, Afro (most at risk), Asian, Other

Ethnicity

We have four target categories, which are simple enough to derive from the group2 values, so we first need to look at whether that field is always filled out or whether we sometimes have to fall back to the first field.

A quick glance suggests that the first field is more specific, but is filled out less than the second (may be due to different centers).


In [39]:
# Spellings that all mean "no ethnicity recorded". They are passed to
# make_mapper as the synonym list for "missing", so they are compared against
# the cleaned (stripped, lower-cased, punctuation-replaced) value.
# Collapsing them helps us figure out where we need to look at both fields
# to work out the ethnic group.
MISSING_ETHNIC_VALUES = ("missing",   
                         "not specified", "not stated", "not given", 
                         "patient unwilling to disclose")

# Spellings that are collapsed to the catch-all "other" label by the same mechanism.
OTHER_ETHNIC_VALUES = ("other", "other mixed", "mixed", "mixed other", "other mixed race", 
                       "any other group", "other ethnic group", "mixed ethnic group")


def make_mapper(target_name, synonyms):
    """
    Build a function that cleans one raw categorical value and collapses synonyms.

    Args:
        target_name: label returned for missing input and for any cleaned value
            found in ``synonyms``.
        synonyms: container of already-cleaned strings that should all be
            reported as ``target_name``.

    Returns:
        A callable ``scan_synonyms(fname, val)``. ``fname`` is accepted so the
        call signature stays the same for existing callers, but it is not used.
        The callable returns ``target_name`` for ``None``, float NaN, or a
        synonym match; otherwise it returns the cleaned string.
    """
    # Compile once per mapper instead of once per value.
    separators = re.compile("[_/()-]")
    # NOTE(review): the "and" alternative has no word boundary, so it is also
    # removed from inside longer words. The patterns are deliberately left
    # unchanged here; presumably downstream lookup keys were built from the
    # current output - confirm before tightening.
    noise = re.compile("and|backgrou|back gro|back ground| und| unspecif")
    space_runs = re.compile(" {2,10}")

    def scan_synonyms(fname, val):
        # Treat None the same as NaN. Previously None reached val.strip() and
        # raised AttributeError before the (unreachable) None check could run.
        if val is None or (isinstance(val, float) and np.isnan(val)):
            return target_name

        # Replace punctuation with spaces, expand the "wb" abbreviation, drop
        # filler fragments, then collapse the runs of spaces left behind.
        cleaned = separators.sub(" ", val.strip().lower())
        cleaned = re.sub("wb", "white british", cleaned)
        cleaned = noise.sub("", cleaned)
        cleaned = space_runs.sub(" ", cleaned)

        return target_name if cleaned in synonyms else cleaned

    return scan_synonyms


# Names of the two ethnicity columns processed by the helpers below.
ETHNIC_FIELDS = ["dem_ethnic_group", "dem_ethnic_group2"]

def tidy_missing_other(df):
    """
    Normalise the ethnicity columns so they are easier to process and analyse.

    Each column in ETHNIC_FIELDS is passed first through the "missing" mapper
    and then through the "other" mapper. The columns are overwritten on the
    frame that was passed in, and that same frame is returned.
    """
    passes = (
        make_mapper("missing", MISSING_ETHNIC_VALUES),
        make_mapper("other", OTHER_ETHNIC_VALUES),
    )

    for column in ETHNIC_FIELDS:
        for normalise in passes:
            # Bind the current mapper and column as defaults so each lambda
            # is self-contained.
            df[column] = df[column].map(
                lambda raw, _fn=normalise, _col=column: _fn(_col, raw)
            )

    return df

def generate_report(df, fields=None):
    """
    Tabulate how often each distinct value occurs in the given columns.

    Args:
        df: DataFrame containing the columns to summarise.
        fields: optional iterable of column names. Defaults to ETHNIC_FIELDS
            when omitted, which matches the previous behaviour.

    Returns:
        dict mapping each column name to a DataFrame indexed by distinct value,
        with a "count" column and a "pct" column (count divided by the total
        number of rows), ordered by count descending.
    """
    if fields is None:
        fields = ETHNIC_FIELDS

    total_rows = float(len(df))
    cdfs = {}
    for fname in fields:
        # value_counts tallies in one pass and returns the counts already
        # sorted descending. dropna=False keeps missing values in the report
        # with their real count (an equality scan never matches NaN).
        tally = df[fname].value_counts(dropna=False)
        cdfs[fname] = pd.DataFrame(
            {"count": tally.values, "pct": tally.values / total_rows},
            index=list(tally.index),
        )

    return cdfs

# Normalise the ethnicity columns (tidy_missing_other also overwrites them on
# the frame it is given), then tabulate the distinct values per column.
df = tidy_missing_other(df)
report = generate_report(df)

# This shows the distinct values for this field, so we can explicitly map typos etc., and see if the missing/other
# synonyms need updating
print("\n".join(sorted(report["dem_ethnic_group"].index)))
#print(df[df.dem_ethnic_group2 == "other"]["dem_ethnic_group"].head())


african
afro caribbean
afro carribean
albanian
algerian
angolan
arab
asian
asian chinese
asian filipino
asian mauritius
asian sri lankan
bangladeshi
black
black african
black asian
black british
black caribbean
black chinese
black other
black white
bosnian
british
british asian
caribbean
caribbean asian
caucasian
chinese
chinese white
columbian
croatian
cypriot part nt st
east african asian
ecuadorian
english
equador
eritrean
ethiopian
filipino
former ussr rep
german
ghanaian
greek
greek cypriot
gypsy romany
indian
indian british india
iranian
iraqi
irish
italian
japanese
kashmiri
kosovan
kurdish
latin american
malaysian
maroco
middle east
missing
mixed african
mixed asian
mixed black
mixed caribbean
nigerian
oriental
other
other african
other asian
other black
other filipino
other former yugosla
other latin american
other sri lankan
other turkish
other white
other white mixed eu
pakistani british pa
polish
portuguese
punjabi
scottish
serbian
sinhalese
somali
south asian
sri lankan
sudanese
tamil
turkish
turkish cypriot
ugan
vietnamese
welsh
white
white asian
white british
white cornish
white irish
white northern irish

In [21]:
#print(df[df.dem_ethnic_group2 == "other"][["dem_ethnic_group", "dem_ethnic_group2"]])

In [12]:
from collections import OrderedDict
import json

# Check whether we have anything in field1 when field2 is missing:
# collect the distinct field1 values on rows where field2 is "missing" but field1 is not.
g2_missing_g1 = set(df["dem_ethnic_group"][(df["dem_ethnic_group2"] == "missing") & (df["dem_ethnic_group"] != "missing")])

# TODO: do the same check for rows where field2 is "other"

# Yes, so we need to map these too
# print "\n".join(sorted(g2_missing_g1))

# So the values we need to map are the g2 values, plus these ones.
# Actually, the key should probably be both columns?

values_to_map = set(df.dem_ethnic_group2).union(g2_missing_g1)
keys = sorted(values_to_map)
# Skeleton lookup: every value to map, in sorted order, with an empty target.
dummy_lookup = OrderedDict((k, "") for k in keys)

# Dump the skeleton as JSON (presumably to be copied and filled in by hand - confirm).
print(json.dumps(dummy_lookup, indent=4))
# print "\n".join(map(lambda x: "\"{}\": ".format(x), sorted(values_to_map)))


{
    "arab": "",
    "asian": "",
    "black": "",
    "black - african": "",
    "black - caribbean": "",
    "black - other": "",
    "black british": "",
    "black-east asian": "",
    "black-south asian": "",
    "british": "",
    "caribbean": "",
    "caucasian": "",
    "east asian": "",
    "east asian (oriental)": "",
    "missing": "",
    "other": "",
    "other white unspecif": "",
    "pakistani/british pa": "",
    "polish": "",
    "south asian": "",
    "south asian-east asian": "",
    "white": "",
    "white-black": "",
    "white-east asian": "",
    "white-south asian": ""
}

In [14]:
# Lookup from a recorded ethnicity value to one of the target labels used here:
# "caucasian", "afro", "asian", "other", plus "missing" for unrecorded values.
# Mixed-heritage entries map to "other", except "south asian-east asian" -> "asian".
ETHNIC_GROUP_MAP = {
    "arab": "caucasian", 
    "asian": "asian", 
    "black": "afro", 
    "black - african": "afro", 
    "black - caribbean": "afro", 
    "black - other": "afro", 
    "black british": "afro", 
    "black-east asian": "other", 
    "black-south asian": "other", 
    "british": "caucasian", 
    "caribbean": "afro", 
    "caucasian": "caucasian", 
    "east asian": "asian", 
    "east asian (oriental)": "asian", 
    "missing": "missing", 
    "other": "other",
    "other white unspecif": "caucasian", 
    "pakistani/british pa": "asian", 
    "polish": "caucasian", 
    "south asian": "asian", 
    "south asian-east asian": "asian", 
    "white": "caucasian", 
    "white-black": "other", 
    "white-east asian": "other", 
    "white-south asian": "other"
}

# Rationale for individual mapping decisions, keyed by the same value as ETHNIC_GROUP_MAP.
ETHNIC_GROUP_MAP_COMMENTS = {
    "british": "Assume caucasian as the person would likely have been more specific otherwise"
}

def ethnic_mapper(val):
    """
    Translate one ethnicity value through ETHNIC_GROUP_MAP.

    Returns the mapped label when ``val`` is a key of the map; any other value
    is handed back unchanged.
    """
    if val in ETHNIC_GROUP_MAP:
        return ETHNIC_GROUP_MAP[val]
    return val

# To map these, we try all the g2 fields, then deal with missing and other afterwards.
# First pass: derive the normalised group from field 2 alone.
df["dem_ethnic_group_norm"] = df.dem_ethnic_group2.map(ethnic_mapper)

# Second pass: where field 2 is "missing" or "other", overwrite the result with
# the mapping of field 1 for those rows only.
for val in ("missing", "other", ):
    subset = (df.dem_ethnic_group2 == val)
    df.loc[subset, "dem_ethnic_group_norm"] = df.dem_ethnic_group.loc[subset].map(ethnic_mapper)

# Need to run some checks
def print_checks():
    """
    Print the distinct source values behind the "missing" and "other" results.

    For each of those two normalised labels, and for each of the two source
    ethnicity columns, prints a header line followed by the sorted distinct
    values found on the matching rows. Reads the module-level ``df`` and
    returns nothing.
    """
    source_columns = ("dem_ethnic_group2", "dem_ethnic_group")
    for label in ("missing", "other"):
        for column in source_columns:
            print("normed is {}".format(label))
            matching_rows = df[df.dem_ethnic_group_norm == label]
            distinct_values = sorted(set(matching_rows[column]))
            print("{} is".format(column), distinct_values)
        print()
    
# print df[(df.dem_ethnic_group_norm == "other") & (df.dem_ethnic_group== "nigerian")][["dem_ethnic_group", "dem_ethnic_group2", "dem_ethnic_group_norm"]]

# More specific checks, to make sure the mapping makes sense. These need to be encapsulated in some
# actual tests for when we get new data or change mappings.
# groups = sorted(set(df[df.dem_ethnic_group_normed == "other"].dem_ethnic_group))
# For every field-1 value among rows normalised to "other", list the distinct field-2 values seen with it.
print("Normed is other")
for k, g in df[df.dem_ethnic_group_norm == "other"].groupby("dem_ethnic_group"):
    print("group is {}".format(k))
    print("group2 is ")
    print("\n".join(sorted(set(g.dem_ethnic_group2))))
    print()


Normed is other
group is algerian
group2 is 
white-black

group is arab
group2 is 
white-black
white-south asian

group is asian
group2 is 
black-south asian
white-east asian
white-south asian

group is black
group2 is 
black-south asian
white-black

group is black - african
group2 is 
black-south asian
white-black

group is black - caribbean
group2 is 
white-black

group is black - other
group2 is 
black-south asian
white-black

group is black african
group2 is 
white-black

group is black and asian
group2 is 
black-south asian

group is black and chinese
group2 is 
white-east asian

group is black and white
group2 is 
black-south asian
white-black

group is black british
group2 is 
black-south asian
white-black

group is british
group2 is 
black-south asian
white-black
white-east asian
white-south asian

group is british asian
group2 is 
white-black
white-east asian
white-south asian

group is caribbean
group2 is 
white-black

group is caribbean asian
group2 is 
black-south asian
white-black

group is caucasian
group2 is 
white-black
white-south asian

group is chinese
group2 is 
white-east asian

group is chinese and white
group2 is 
black-south asian
white-east asian
white-south asian

group is columbian
group2 is 
white-black

group is ecuadorian
group2 is 
white-black

group is english
group2 is 
white-black

group is eritrean
group2 is 
white-black

group is filipino
group2 is 
white-east asian

group is indian/british india
group2 is 
white-south asian

group is italian
group2 is 
white-south asian

group is japanese
group2 is 
white-east asian

group is missing
group2 is 
black-east asian
black-south asian
white-black
white-east asian
white-south asian

group is mixed african
group2 is 
white-black

group is mixed asian
group2 is 
black-east asian
white-east asian
white-south asian

group is mixed black
group2 is 
white-black
white-south asian

group is mixed caribbean
group2 is 
black-south asian
white-black

group is mixed ethnic group
group2 is 
black-south asian
white-black
white-east asian
white-south asian

group is nigerian
group2 is 
white-black

group is other
group2 is 
black-east asian
black-south asian
missing
other
white-black
white-east asian
white-south asian

group is other ( filipino )
group2 is 
white-east asian

group is other asian backgrou
group2 is 
white-east asian
white-south asian

group is other asian unspecif
group2 is 
black-south asian
white-black
white-east asian
white-south asian

group is other black backgrou
group2 is 
white-black

group is other black unspecif
group2 is 
white-black

group is other latin american
group2 is 
white-black

group is other mixed backgrou
group2 is 
white-black
white-east asian
white-south asian

group is other white back gro
group2 is 
white-black

group is other white unspecif
group2 is 
white-black
white-south asian

group is other white/mixed eu
group2 is 
white-black
white-south asian

group is pakistani/british pa
group2 is 
black-south asian
white-south asian

group is portuguese
group2 is 
white-black

group is white
group2 is 
white-black
white-south asian

group is white and asian
group2 is 
black-south asian
white-black
white-east asian
white-south asian


In [42]:
# Distinct "field1 | field2" combinations present in the data.
set(df.dem_ethnic_group + " | " + df.dem_ethnic_group2)


Out[42]:
{'african | black',
 'afro caribbean | black',
 'afro carribean | black',
 'albanian | white',
 'algerian | black',
 'algerian | white',
 'algerian | white black',
 'angolan | black',
 'angolan | white',
 'arab | black',
 'arab | east asian',
 'arab | missing',
 'arab | white',
 'arab | white black',
 'arab | white south asian',
 'asian chinese | east asian',
 'asian chinese | south asian',
 'asian filipino | east asian oriental',
 'asian mauritius | south asian',
 'asian sri lankan | south asian',
 'asian | black',
 'asian | black south asian',
 'asian | east asian',
 'asian | east asian oriental',
 'asian | missing',
 'asian | south asian',
 'asian | south asian east asian',
 'asian | white',
 'asian | white east asian',
 'asian | white south asian',
 'bangladeshi | black',
 'bangladeshi | south asian',
 'black african | black',
 'black african | black south asian',
 'black african | missing',
 'black african | south asian',
 'black african | white',
 'black african | white black',
 'black asian | black',
 'black asian | black south asian',
 'black asian | south asian',
 'black asian | white',
 'black british | black',
 'black british | black south asian',
 'black british | missing',
 'black british | white',
 'black british | white black',
 'black caribbean | black',
 'black caribbean | missing',
 'black caribbean | south asian',
 'black caribbean | white',
 'black caribbean | white black',
 'black chinese | white east asian',
 'black other | black',
 'black other | black south asian',
 'black other | missing',
 'black other | white black',
 'black white | black',
 'black white | black south asian',
 'black white | white black',
 'black | black',
 'black | black south asian',
 'black | white black',
 'bosnian | white',
 'british asian | east asian',
 'british asian | south asian',
 'british asian | white',
 'british asian | white black',
 'british asian | white east asian',
 'british asian | white south asian',
 'british | black',
 'british | black south asian',
 'british | east asian',
 'british | missing',
 'british | other',
 'british | south asian',
 'british | white',
 'british | white black',
 'british | white east asian',
 'british | white south asian',
 'caribbean asian | black',
 'caribbean asian | black south asian',
 'caribbean asian | east asian',
 'caribbean asian | white',
 'caribbean asian | white black',
 'caribbean | black',
 'caribbean | missing',
 'caribbean | white',
 'caribbean | white black',
 'caucasian | black',
 'caucasian | missing',
 'caucasian | south asian',
 'caucasian | white',
 'caucasian | white black',
 'caucasian | white south asian',
 'chinese white | black south asian',
 'chinese white | white',
 'chinese white | white east asian',
 'chinese white | white south asian',
 'chinese | east asian',
 'chinese | east asian oriental',
 'chinese | south asian',
 'chinese | south asian east asian',
 'chinese | white',
 'chinese | white east asian',
 'columbian | black',
 'columbian | white',
 'columbian | white black',
 'croatian | white',
 'cypriot part nt st | white',
 'east african asian | south asian',
 'east african asian | white',
 'ecuadorian | white',
 'ecuadorian | white black',
 'english | white',
 'english | white black',
 'equador | white',
 'eritrean | black',
 'eritrean | white black',
 'ethiopian | black',
 'filipino | east asian',
 'filipino | south asian',
 'filipino | south asian east asian',
 'filipino | white',
 'filipino | white east asian',
 'former ussr rep | south asian',
 'former ussr rep | white',
 'german | white',
 'ghanaian | black',
 'ghanaian | white',
 'greek cypriot | white',
 'greek | white',
 'gypsy romany | white',
 'indian british india | east asian',
 'indian british india | south asian',
 'indian british india | white',
 'indian british india | white south asian',
 'indian | south asian',
 'iranian | east asian',
 'iranian | south asian',
 'iranian | white',
 'iraqi | south asian',
 'iraqi | white',
 'irish | white',
 'italian | white',
 'italian | white south asian',
 'japanese | east asian',
 'japanese | white east asian',
 'kashmiri | south asian',
 'kosovan | white',
 'kurdish | south asian',
 'kurdish | white',
 'latin american | white',
 'malaysian | east asian',
 'maroco | white',
 'middle east | south asian',
 'middle east | white',
 'missing | black',
 'missing | black east asian',
 'missing | black south asian',
 'missing | east asian',
 'missing | east asian oriental',
 'missing | missing',
 'missing | other',
 'missing | south asian',
 'missing | south asian east asian',
 'missing | white',
 'missing | white black',
 'missing | white east asian',
 'missing | white south asian',
 'mixed african | black',
 'mixed african | white',
 'mixed african | white black',
 'mixed asian | black east asian',
 'mixed asian | south asian',
 'mixed asian | white',
 'mixed asian | white east asian',
 'mixed asian | white south asian',
 'mixed black | black',
 'mixed black | white',
 'mixed black | white black',
 'mixed black | white south asian',
 'mixed caribbean | black',
 'mixed caribbean | black south asian',
 'mixed caribbean | white',
 'mixed caribbean | white black',
 'nigerian | black',
 'nigerian | white',
 'nigerian | white black',
 'oriental | east asian',
 'oriental | white',
 'other african | black',
 'other asian | black',
 'other asian | black south asian',
 'other asian | east asian',
 'other asian | south asian',
 'other asian | south asian east asian',
 'other asian | white',
 'other asian | white black',
 'other asian | white east asian',
 'other asian | white south asian',
 'other black | black',
 'other black | white',
 'other black | white black',
 'other filipino | white east asian',
 'other former yugosla | white',
 'other latin american | black',
 'other latin american | white',
 'other latin american | white black',
 'other sri lankan | south asian',
 'other turkish | white',
 'other white mixed eu | south asian',
 'other white mixed eu | white',
 'other white mixed eu | white black',
 'other white mixed eu | white south asian',
 'other white | east asian',
 'other white | missing',
 'other white | south asian',
 'other white | white',
 'other white | white black',
 'other white | white south asian',
 'other | black',
 'other | black east asian',
 'other | black south asian',
 'other | east asian',
 'other | east asian oriental',
 'other | missing',
 'other | other',
 'other | south asian',
 'other | south asian east asian',
 'other | white',
 'other | white black',
 'other | white east asian',
 'other | white south asian',
 'pakistani british pa | black south asian',
 'pakistani british pa | missing',
 'pakistani british pa | south asian',
 'pakistani british pa | white',
 'pakistani british pa | white south asian',
 'polish | missing',
 'polish | white',
 'portuguese | black',
 'portuguese | white',
 'portuguese | white black',
 'punjabi | south asian',
 'scottish | white',
 'serbian | white',
 'sinhalese | south asian',
 'somali | black',
 'somali | south asian',
 'south asian | east asian',
 'south asian | south asian',
 'sri lankan | east asian',
 'sri lankan | south asian',
 'sudanese | black',
 'tamil | east asian',
 'turkish cypriot | white',
 'turkish | white',
 'ugan | black',
 'vietnamese | east asian',
 'vietnamese | south asian',
 'vietnamese | south asian east asian',
 'welsh | white',
 'white asian | black south asian',
 'white asian | east asian',
 'white asian | south asian',
 'white asian | white',
 'white asian | white black',
 'white asian | white east asian',
 'white asian | white south asian',
 'white british | white',
 'white cornish | white',
 'white irish | white',
 'white northern irish | white',
 'white | missing',
 'white | south asian',
 'white | white',
 'white | white black',
 'white | white south asian'}

In [54]:
# Build a combined "field1 | field2" key per row, then load the lookup that is
# applied to that key from a JSON file under the configured input root.
df["dem_ethnic_key"] = df.dem_ethnic_group + " | " + df.dem_ethnic_group2
map_file = join(settings.DATA_IN_ROOT, "ethnicity_map.json")
with open(map_file, "rb") as f:
    ethnic_map = json.load(f)

def ethnic_mapper(k):
    """
    Look up a combined ethnicity key in the loaded ``ethnic_map``.

    Returns the mapped label, or the string "unknown" when the key is absent.
    """
    if k in ethnic_map:
        return ethnic_map[k]
    return "unknown"
# Map every combined key to its normalised group; keys absent from the map become "unknown".
df["dem_ethnic_norm"] = df.dem_ethnic_key.map(ethnic_mapper)

# Summarise each normalised group as "<row count>, <share of all rows>".
# NOTE: this rebinds `report`, which an earlier cell used for the per-column count tables.
report = {}
for k, sdf in df.groupby("dem_ethnic_norm"):
    report[k] = "{:,}, {:.0%}".format(len(sdf), float(len(sdf)) / len(df))
print(report)


{'other', 'caucasian', 'asian', 'afro', 'missing'}
{'afro': '7,951, 12%', 'asian': '8,983, 14%', 'caucasian': '34,735, 54%', 'missing': '10,718, 17%', 'other': '1,401, 2%'}